rm(list = ls())
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readxl)
library(dplyr)
library(ggplot2)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(here)
## here() starts at /Users/sukem09
library(readxl)
library(dplyr)
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(quantmod)
## Loading required package: xts
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tsibble)
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:zoo':
##
## index
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(stats)
library(purrr)
library(rlang)
##
## Attaching package: 'rlang'
##
## The following object is masked from 'package:magrittr':
##
## set_names
##
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
library(forecast)
tsdata <- read_excel("~/Desktop/case study/PredictorData2022.xlsx")
ts_quarter <- read_excel("~/Desktop/case study/PredictorData2022.xlsx", sheet = "Quarterly")
ts_year <- read_excel("~/Desktop/case study/PredictorData2022.xlsx", sheet = "Annual")
tsdata$yyyymm <- as.yearmon(as.character(tsdata$yyyymm), format="%Y%m")
ts_quarter$yyyyq <- as.yearqtr(as.character(ts_quarter$yyyyq), format="%Y%q")
ts_year$yyyy <- as.Date(as.character(ts_year$yyyy), format="%Y")
tsdata %<>% select(-CRSP_SPvw, -CRSP_SPvwx)
ts_quarter %<>% select(-CRSP_SPvw, -CRSP_SPvwx)
ts_year %<>% select(-CRSP_SPvw, -CRSP_SPvwx)
vars_monthly <- names(tsdata)
vars_quarterly <-names(ts_quarter) #Contains additional varibales: cay, ik, D3, E3
vars_annual <-names(ts_year) #Contains additional varibales: eqis, ik
setdiff(vars_quarterly,vars_monthly)
## [1] "yyyyq" "cay" "ik" "D3" "E3"
setdiff(vars_annual,vars_monthly)
## [1] "yyyy" "cay" "eqis" "ik"
# Rename the column
colname_mapping <- c(
"time" = "yyyymm",
"time" = "yyyyq",
"time" = "yyyy",
"sp_index" = "Index",
"dividend_12" = "D12",
"earning_12" = "E12",
"book_market" = "b/m",
"treasury_bill" = "tbl",
"cor_AAA" = "AAA",
"cor_BAA" = "BAA",
"lt_yeild" = "lty",
"net_equity" = "ntis",
"risk_free" = "Rfree",
"inflation" = "infl",
"lt_returnrate" = "ltr",
"cbond_return" = "corpr",
"stock_var" = "svar",
"cs_premium" = "csp",
"consum_weal_incomeratio" = "cay",
"investment_capitalratio" = "ik",
"dividend3year" = "D3",
"earning3year" = "E3",
"consum_wealth_monthly" = "caym",
"equity_issuing" = "eqis"
)
#Rename columns
filtered_dict_monthly <- colname_mapping[colname_mapping %in% colnames(tsdata)]
tsdata <- rename(tsdata, !!!filtered_dict_monthly)
filtered_dict_quarterly <- colname_mapping[colname_mapping %in% colnames(ts_quarter)]
ts_quarter <- rename(ts_quarter, !!!filtered_dict_quarterly)
filtered_dict_annual <- colname_mapping[colname_mapping %in% colnames(ts_year)]
ts_year <- rename(ts_year, !!!filtered_dict_annual)
# Transform variables into numeric
for(i in names(tsdata)) {
if(class(tsdata[[i]]) == "character") {
tsdata[[i]] <- as.numeric(tsdata[[i]])
}
}
for(i in names(ts_quarter)) {
if(class(ts_quarter[[i]]) == "character") {
ts_quarter[[i]] <- as.numeric(ts_quarter[[i]])
}
}
for(i in names(ts_year)) {
if(class(ts_year[[i]]) == "character") {
ts_year[[i]] <- as.numeric(ts_year[[i]])
}
}
calculate_returns <- function(input){
input %<>% mutate(returns = as.vector(quantmod::Delt(sp_index)))
input %<>% mutate(excess_returns = returns - risk_free)
return(input)
}
# Create subsets for different frequencies --------------------------------
tsdata %<>% calculate_returns()
ts_quarter %<>% calculate_returns()
ts_year %<>% calculate_returns()
sp_index_plot <- ggplot(tsdata, aes(x = time, y = sp_index)) +
geom_line() +
labs(title = "Time Series Plot of S&P 500 Index", x = "Time", y = "S&P 500 Index")
print(sp_index_plot)
vars_for_ts_plot <- setdiff(names(tsdata) , c("time", "sp_index_lag1m"))
vars_for_scatter_plot <- setdiff(names(tsdata) , c("time", "sp_500", "lag_index", "returns", "excess_returns"))
plot_list_ts <- list()
plot_list_lagged <- list()
# Generate one scatter plot against excess return
for(i in 1:length(vars_for_ts_plot)) {
title_text <- paste("(", letters[i], ") ", vars_for_ts_plot[i], sep = "")
current_plot_ts <- ggplot(data=tsdata, aes(x=time, y=lag(!!sym(vars_for_ts_plot[i])))) +
geom_line() +
ggtitle(title_text) +
theme_bw()
plot_list_ts[[i]] <- current_plot_ts
}
# Create a grid layout of the plots
grid.arrange(grobs = plot_list_ts[1:6], ncol = 2)
grid.arrange(grobs = plot_list_ts[7:12], ncol = 2)
grid.arrange(grobs = plot_list_ts[13:18], ncol = 2)
vars_for_ts_plotq <- setdiff(names(ts_quarter) , c("time", "sp_index_lag1m"))
vars_for_scatter_plotq <- setdiff(names(ts_quarter) , c("time", "sp_500", "lag_index", "returns", "excess_returns"))
plot_list_tsq <- list()
plot_list_laggedq <- list()
for(i in 1:length(vars_for_ts_plotq)) {
title_text <- paste("(", letters[i], ") ", vars_for_ts_plotq[i], sep = "")
current_plot_tsq <- ggplot(data=ts_quarter, aes(x=time, y=lag(!!sym(vars_for_ts_plotq[i])))) +
geom_line() +
ggtitle(title_text) +
theme_bw()
plot_list_ts[[i]] <- current_plot_tsq
}
# Create a grid layout of the plots
grid.arrange(grobs = plot_list_ts[1:6], ncol = 2)
grid.arrange(grobs = plot_list_ts[7:12], ncol = 2)
grid.arrange(grobs = plot_list_ts[13:18], ncol = 2)
grid.arrange(grobs = plot_list_ts[19:14], ncol = 2)
excess_returns_plot <- ggplot(tsdata, aes(x = time, y = excess_returns)) +
geom_line() +
labs(title = "Time Series Plot of Monthly Excess Returns", x = "Time", y = "Excess Return")
print(excess_returns_plot)
excess_returns_plotq <- ggplot(ts_quarter, aes(x = time, y = excess_returns)) +
geom_line() +
labs(title = "Time Series Plot of Quaterly Excess Returns", x = "Time", y = "Excess Return")
print(excess_returns_plotq)
excess_returns_ploty <- ggplot(ts_year, aes(x = time, y = excess_returns)) +
geom_line() +
labs(title = "Time Series Plot of Yearly Excess Returns", x = "Time", y = "Excess Return")
print(excess_returns_ploty)
summary(tsdata$excess_returns)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.299723 -0.022091 0.003065 0.001912 0.028665 0.421222 1
summary(ts_quarter$excess_returns)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.399606 -0.040886 0.011416 0.006462 0.058085 0.861607 1
summary(ts_year$excess_returns)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.48035 -0.08643 0.02497 0.02539 0.15412 0.46217 1
excess_rt <- na.omit(tsdata$excess_returns)
acf_result <- Acf(excess_rt, lag.max = 60, main = "ACF of Excess Return")
pacf_result <- pacf(excess_rt,lag.max = 60, main = "PACF of Excess Return")
Look more likely AR(1) model
excess_rtq <- na.omit(ts_quarter$excess_returns)
acf_resultq <- Acf(excess_rtq, lag.max = 60, main = "ACF of Excess Return")
pacf_resultq <- pacf(excess_rtq,lag.max = 60, main = "PACF of Excess Return")
Look like AR (10)
excess_rty <- na.omit(ts_year$excess_returns)
acf_resulty <- Acf(excess_rty, main = "ACF of Excess Return")
pacf_resulty <- pacf(excess_rty, main = "PACF of Excess Return")
Look like no correlation
ar_modelling <- function(input) {
data_cleaned <- input %>% filter(!is.na(excess_returns))
p_values <- 1:2
bic_values <- numeric(length(p_values))
ar_models <- list()
for (p in p_values) {
AR_model <- arima(data_cleaned$excess_returns, order = c(p, 0, 0), method = "ML")
bic_values[p] <- AIC(AR_model)
ar_models[[p]] <- AR_model
}
best_p <- p_values[which.min(bic_values)]
best_AR <- ar_models[[best_p]]
# Calculate fitted values
fitted_values <- data_cleaned$excess_returns - best_AR$residuals
plot_data <- data.frame(
Date = as.Date(input$time[!is.na(input$excess_returns)]),
ExcessReturns = data_cleaned$excess_returns,
Fitted = fitted_values
)
MSFE <- sqrt(mean(best_AR$residuals^2))
# Plot data
plot <- ggplot(plot_data, aes(x = Date)) +
geom_line(aes(y = ExcessReturns), color = "black") +
geom_line(aes(y = Fitted), color = "red") +
labs(title = paste("Time Series with AR(", best_p, ") Fitted Values")) +
theme_minimal()
result <- list(best_p = best_p, bic_values = bic_values, AR_model = best_AR, MSFE = MSFE, plot = plot)
return(result)
}
result <- ar_modelling(input = tsdata)
print(paste("Best AR(p) order:", result$best_p))
## [1] "Best AR(p) order: 1"
summary(result$AR_model)
##
## Call:
## arima(x = data_cleaned$excess_returns, order = c(p, 0, 0), method = "ML")
##
## Coefficients:
## ar1 intercept
## 0.1145 0.0019
## s.e. 0.0233 0.0012
##
## sigma^2 estimated as 0.002221: log likelihood = 2982.43, aic = -5958.87
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set NaN NaN NaN NaN NaN
print(paste("AIC values for p =", 1:5, ":", result$bic_values))
## [1] "AIC values for p = 1 : -5958.86991226427"
## [2] "AIC values for p = 2 : -5958.38195971994"
## [3] "AIC values for p = 3 : -5958.86991226427"
## [4] "AIC values for p = 4 : -5958.38195971994"
## [5] "AIC values for p = 5 : -5958.86991226427"
print(paste("MSFE:", result$MSFE))
## [1] "MSFE: 0.0471252669186668"
print(result$plot)
ar_modelling <- function(input) {
data_cleaned <- input %>% filter(!is.na(excess_returns))
p_values <- 1:5
bic_values <- numeric(length(p_values))
ar_models <- list()
for (p in p_values) {
AR_model <- arima(data_cleaned$excess_returns, order = c(p, 0, 0), method = "ML")
bic_values[p] <- AIC(AR_model)
ar_models[[p]] <- AR_model
}
best_p <- p_values[which.min(bic_values)]
best_AR <- ar_models[[best_p]]
# Calculate fitted values
fitted_values <- data_cleaned$excess_returns - best_AR$residuals
plot_data <- data.frame(
Date = as.Date(input$time[!is.na(input$excess_returns)]),
ExcessReturns = data_cleaned$excess_returns,
Fitted = fitted_values
)
ar_model_summary <- coef(summary(best_AR))
MSFE <- sqrt(mean(best_AR$residuals^2))
# Plot data
plot <- ggplot(plot_data, aes(x = Date)) +
geom_line(aes(y = ExcessReturns), color = "black") +
geom_line(aes(y = Fitted), color = "red") +
labs(title = paste("Time Series with AR(", best_p, ") Fitted Values")) +
theme_minimal()
result <- list(best_p = best_p, bic_values = bic_values, AR_model = best_AR, AR_model_summary = ar_model_summary, MSFE = MSFE, plot = plot)
return(result)
}
result <- ar_modelling(input = ts_quarter)
print(paste("Best AR(p) order:", result$best_p))
## [1] "Best AR(p) order: 4"
print(paste("BIC values for p =", 1:5, ":", result$bic_values))
## [1] "BIC values for p = 1 : -1115.64419869863"
## [2] "BIC values for p = 2 : -1113.64972456684"
## [3] "BIC values for p = 3 : -1128.31218224036"
## [4] "BIC values for p = 4 : -1136.67976421326"
## [5] "BIC values for p = 5 : -1134.73875396023"
summary(result$AR_model)
##
## Call:
## arima(x = data_cleaned$excess_returns, order = c(p, 0, 0), method = "ML")
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.0011 0.0074 0.1617 -0.1302 0.0065
## s.e. 0.0402 0.0397 0.0397 0.0403 0.0040
##
## sigma^2 estimated as 0.008822: log likelihood = 574.34, aic = -1136.68
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set NaN NaN NaN NaN NaN
print(paste("MSFE:", result$MSFE))
## [1] "MSFE: 0.0939251511495606"
print(result$plot)
result <- ar_modelling(input = ts_year)
print(paste("Best AR(p) order:", result$best_p))
## [1] "Best AR(p) order: 2"
print(paste("BIC values for p =", 1:5, ":", result$bic_values))
## [1] "BIC values for p = 1 : -74.5693588331313"
## [2] "BIC values for p = 2 : -77.0878739547895"
## [3] "BIC values for p = 3 : -76.194208003193"
## [4] "BIC values for p = 4 : -74.8766921143495"
## [5] "BIC values for p = 5 : -76.9190271692618"
summary(result$AR_model)
##
## Call:
## arima(x = data_cleaned$excess_returns, order = c(p, 0, 0), method = "ML")
##
## Coefficients:
## ar1 ar2 intercept
## 0.0318 -0.1735 0.0256
## s.e. 0.0804 0.0810 0.0130
##
## sigma^2 estimated as 0.03331: log likelihood = 42.54, aic = -77.09
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set NaN NaN NaN NaN NaN
print(paste("MSFE:", result$MSFE))
## [1] "MSFE: 0.182520762925912"
print(result$plot)
regressors_all <- union(union(names(ts_year), names(tsdata)), names(ts_quarter)) %>% setdiff(c("risk_free", "sp_index", "returns", "excess_returns", "time"))
regressors_monthly <- names(tsdata) %>% setdiff(c( "risk_free","sp_index", "returns", "excess_returns", "time"))
regressors_quarterly <- names(ts_quarter) %>% setdiff(c( "risk_free","sp_index", "returns", "excess_returns","time"))
regressors_annual <- names(ts_year) %>% setdiff(c("risk_free","sp_index", "returns", "excess_returns", "time"))
lagged_regressors_all <- paste0("lag(", regressors_all, ")")
lagged_regressors_monthly <- paste0("lag(", regressors_monthly, ")")
lagged_regressors_quarterly <- paste0("lag(", regressors_quarterly, ")")
lagged_regressors_annual <- paste0("lag(", regressors_annual, ")")
MSFE_df <- data.frame(monthly=rep(NA,20),
quarterly=rep(NA,20),
annual=rep(NA,20),
row.names = c(regressors_all,"all predictors", "AR(p)"))
datasets <- list(tsdata, ts_quarter, ts_year)
# Create an empty data frame to store MSFE values
MSFE_df_monthly <- data.frame(Model = c(regressors_monthly, "all predictors", "AR(p)"),
MSFE = rep(NA, length(regressors_monthly) + 2))
for (i in 1:length(MSFE_df_monthly$Model)) {
if (MSFE_df_monthly$Model[i] == "AR(p)") {
# Try to calculate MSFE for AR(p) model
tryCatch({
ar_result <- ar_modelling(input = tsdata)
MSFE_df_monthly$MSFE[i] <- ar_result$MSFE_AIC
}, error = function(e) {
cat("Warning: AR(p) model did not converge or fit successfully.\n")
})
} else if (MSFE_df_monthly$Model[i] == "all predictors") {
# LM for all predictors
formula_all_predictors1 <- paste("excess_returns ~", paste(lagged_regressors_monthly, collapse = " + ")) %>% as.formula()
model_all_predictors <- lm(formula_all_predictors1, data = tsdata)
MSFE_df_monthly$MSFE[i] <- mean(model_all_predictors$residuals^2)^0.5
} else {
# LM for single predictor
formula <- paste("excess_returns ~ ", MSFE_df_monthly$Model[i], sep = "") %>% as.formula()
model <- lm(formula, data = tsdata)
MSFE_df_monthly$MSFE[i] <- mean(model$residuals^2)^0.5
}
}
## Warning: AR(p) model did not converge or fit successfully.
# Print or return the MSFE data frame
print(MSFE_df_monthly)
## Model MSFE
## 1 dividend_12 0.04740964
## 2 earning_12 0.04741400
## 3 book_market 0.05292828
## 4 treasury_bill 0.05292782
## 5 cor_AAA 0.05285853
## 6 cor_BAA 0.05285202
## 7 lt_yeild 0.05284913
## 8 net_equity 0.05400716
## 9 inflation 0.05201274
## 10 lt_returnrate 0.05385217
## 11 cbond_return 0.05298089
## 12 stock_var 0.04758575
## 13 cs_premium 0.04606330
## 14 all predictors 0.04470067
## 15 AR(p) NA
# Create an empty data frame to store MSFE values
MSFE_df_quarterly <- data.frame(Model = c(regressors_quarterly, "all predictors", "AR(p)"), MSFE = rep(NA, length(regressors_quarterly) + 2))
for (i in 1:length(MSFE_df_quarterly$Model)) {
if (MSFE_df_quarterly$Model[i] == "AR(p)") {
# Try to calculate MSFE for AR(p) model
tryCatch({
ar_result <- ar_modelling(input = ts_quarter)
MSFE_df_quarterly$MSFE[i] <- ar_result$MSFE_AIC
}, error = function(e) {
cat("Warning: AR(p) model did not converge or fit successfully.\n")
})
} else if (MSFE_df_quarterly$Model[i] == "all predictors") {
# LM for all predictors
formula_all_predictors <- paste("excess_returns ~", paste(lagged_regressors_quarterly, collapse = " + ")) %>% as.formula()
model_all_predictors <- lm(formula_all_predictors, data = ts_quarter)
MSFE_df_quarterly$MSFE[i] <- mean(model_all_predictors$residuals^2)^0.5
} else {
# LM for single predictor
formula <- paste("excess_returns ~ ", MSFE_df_quarterly$Model[i], sep = "") %>% as.formula()
model <- lm(formula, data = ts_quarter)
MSFE_df_quarterly$MSFE[i] <- mean(model$residuals^2)^0.5
}
}
## Warning: AR(p) model did not converge or fit successfully.
# Print or return the MSFE data frame
print(MSFE_df_quarterly)
## Model MSFE
## 1 dividend_12 0.09595532
## 2 earning_12 0.09596016
## 3 book_market 0.10582089
## 4 treasury_bill 0.10619995
## 5 cor_AAA 0.10601085
## 6 cor_BAA 0.10572017
## 7 lt_yeild 0.10609422
## 8 consum_weal_incomeratio 0.07902971
## 9 net_equity 0.10873910
## 10 inflation 0.10465406
## 11 lt_returnrate 0.10875313
## 12 cbond_return 0.10705539
## 13 stock_var 0.09654196
## 14 cs_premium 0.08306658
## 15 investment_capitalratio 0.07619183
## 16 dividend3year 0.07811759
## 17 earning3year 0.08351558
## 18 all predictors 0.05974119
## 19 AR(p) NA
MSFE_df_annual <- data.frame(Model = c(regressors_monthly, "all predictors", "AR(p)"),
MSFE = rep(NA, length(regressors_monthly) + 2))
#annual data
for (i in 1:length(MSFE_df_annual$Model)) {
if (MSFE_df_annual$Model[i] == "AR(p)") {
# Try to calculate MSFE for AR(p) model
tryCatch({
ar_result <- ar_modelling(input = ts_year)
MSFE_df_annual$MSFE[i] <- ar_result$MSFE_AIC
}, error = function(e) {
cat("Warning: AR(p) model did not converge or fit successfully.\n")
})
} else if (MSFE_df_annual$Model[i] == "all predictors") {
# LM for all predictors
formula_all_predictors <- paste("excess_returns ~", paste(lagged_regressors_annual, collapse = " + ")) %>% as.formula()
model_all_predictors <- lm(formula_all_predictors, data = ts_year)
MSFE_df_annual$MSFE[i] <- mean(model_all_predictors$residuals^2)^0.5
} else {
# LM for single predictor
formula <- paste("excess_returns ~ ", MSFE_df_annual$Model[i], sep = "") %>% as.formula()
model <- lm(formula, data = ts_year)
MSFE_df_annual$MSFE[i] <- mean(model$residuals^2)^0.5
}
}
## Warning: AR(p) model did not converge or fit successfully.
# Print or return the MSFE data frame
print(MSFE_df_annual)
## Model MSFE
## 1 dividend_12 0.1842201
## 2 earning_12 0.1833799
## 3 book_market 0.1812140
## 4 treasury_bill 0.1899635
## 5 cor_AAA 0.1882981
## 6 cor_BAA 0.1842969
## 7 lt_yeild 0.1892734
## 8 net_equity 0.1933174
## 9 inflation 0.1913295
## 10 lt_returnrate 0.1932781
## 11 cbond_return 0.1904286
## 12 stock_var 0.1805771
## 13 cs_premium 0.1727220
## 14 all predictors 0.1355022
## 15 AR(p) NA
# MSFE_df_monthly
# MSFE_df_quarterly
# MSFE_df_annual
merged_12 <- merge(MSFE_df_monthly, MSFE_df_quarterly, by="row.names", all=TRUE)
rownames(merged_12) <- merged_12$Row.names
merged_12 %<>% select(-Row.names)
final_merged <- merge(merged_12, MSFE_df_annual, by="row.names", all=TRUE)
rownames(final_merged) <- final_merged$Row.names
final_merged %<>% select(-Row.names)
final_merged
## Model.x MSFE.x Model.y MSFE.y Model
## 1 dividend_12 0.04740964 dividend_12 0.09595532 dividend_12
## 10 lt_returnrate 0.05385217 inflation 0.10465406 lt_returnrate
## 11 cbond_return 0.05298089 lt_returnrate 0.10875313 cbond_return
## 12 stock_var 0.04758575 cbond_return 0.10705539 stock_var
## 13 cs_premium 0.04606330 stock_var 0.09654196 cs_premium
## 14 all predictors 0.04470067 cs_premium 0.08306658 all predictors
## 15 AR(p) NA investment_capitalratio 0.07619183 AR(p)
## 16 <NA> NA dividend3year 0.07811759 <NA>
## 17 <NA> NA earning3year 0.08351558 <NA>
## 18 <NA> NA all predictors 0.05974119 <NA>
## 19 <NA> NA AR(p) NA <NA>
## 2 earning_12 0.04741400 earning_12 0.09596016 earning_12
## 3 book_market 0.05292828 book_market 0.10582089 book_market
## 4 treasury_bill 0.05292782 treasury_bill 0.10619995 treasury_bill
## 5 cor_AAA 0.05285853 cor_AAA 0.10601085 cor_AAA
## 6 cor_BAA 0.05285202 cor_BAA 0.10572017 cor_BAA
## 7 lt_yeild 0.05284913 lt_yeild 0.10609422 lt_yeild
## 8 net_equity 0.05400716 consum_weal_incomeratio 0.07902971 net_equity
## 9 inflation 0.05201274 net_equity 0.10873910 inflation
## MSFE
## 1 0.1842201
## 10 0.1932781
## 11 0.1904286
## 12 0.1805771
## 13 0.1727220
## 14 0.1355022
## 15 NA
## 16 NA
## 17 NA
## 18 NA
## 19 NA
## 2 0.1833799
## 3 0.1812140
## 4 0.1899635
## 5 0.1882981
## 6 0.1842969
## 7 0.1892734
## 8 0.1933174
## 9 0.1913295
# Minimum MSFE
final_merged %>% select(MSFE.x) %>% filter(MSFE.x == min(MSFE.x, na.rm = TRUE))
## MSFE.x
## 14 0.04470067
final_merged %>% select(MSFE.y) %>% filter(MSFE.y == min(MSFE.y, na.rm = TRUE))
## MSFE.y
## 18 0.05974119
final_merged %>% select(MSFE) %>% filter(MSFE == min(MSFE, na.rm = TRUE))
## MSFE
## 14 0.1355022
# Maximum MSFE
final_merged %>% select(MSFE.x) %>% filter(MSFE.x == max(MSFE.x, na.rm = TRUE))
## MSFE.x
## 8 0.05400716
final_merged %>% select(MSFE.y) %>% filter(MSFE.y == max(MSFE.y, na.rm = TRUE))
## MSFE.y
## 11 0.1087531
final_merged %>% select(MSFE) %>% filter(MSFE == max(MSFE, na.rm = TRUE))
## MSFE
## 8 0.1933174
colMeans(is.na(tsdata))
## time sp_index dividend_12 earning_12 book_market
## 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.3300438596
## treasury_bill cor_AAA cor_BAA lt_yeild net_equity
## 0.3223684211 0.3157894737 0.3157894737 0.3157894737 0.3678728070
## risk_free inflation lt_returnrate cbond_return stock_var
## 0.0005482456 0.2768640351 0.3618421053 0.3618421053 0.0926535088
## cs_premium returns excess_returns
## 0.5679824561 0.0005482456 0.0005482456
colMeans(is.na(ts_quarter))
## time sp_index dividend_12
## 0.000000000 0.000000000 0.000000000
## earning_12 book_market treasury_bill
## 0.000000000 0.328947368 0.322368421
## cor_AAA cor_BAA lt_yeild
## 0.315789474 0.315789474 0.315789474
## consum_weal_incomeratio net_equity risk_free
## 0.532894737 0.366776316 0.001644737
## inflation lt_returnrate cbond_return
## 0.277960526 0.361842105 0.361842105
## stock_var cs_premium investment_capitalratio
## 0.092105263 0.567434211 0.500000000
## dividend3year earning3year returns
## 0.769736842 0.421052632 0.001644737
## excess_returns
## 0.001644737
colMeans(is.na(ts_year))
## time sp_index dividend_12
## 0.000000000 0.000000000 0.000000000
## earning_12 book_market treasury_bill
## 0.000000000 0.328947368 0.322368421
## cor_AAA cor_BAA lt_yeild
## 0.315789474 0.315789474 0.315789474
## consum_weal_incomeratio net_equity risk_free
## 0.486842105 0.361842105 0.000000000
## inflation equity_issuing lt_returnrate
## 0.282894737 0.368421053 0.361842105
## cbond_return stock_var cs_premium
## 0.361842105 0.092105263 0.565789474
## investment_capitalratio returns excess_returns
## 0.500000000 0.006578947 0.006578947
formula_all_predictors_monthly <- paste("excess_returns ~", paste(lagged_regressors_monthly, collapse=" + ")) %>% as.formula()
full_model_monthly <- lm(formula_all_predictors_monthly, data=tsdata) # Fit a model with all predictors
step_model_monthly <- stats::step(full_model_monthly, direction="both", trace=0, k=2) # Stepwise regression with BIC
summary(step_model_monthly)
##
## Call:
## lm(formula = excess_returns ~ lag(dividend_12) + lag(book_market) +
## lag(cor_BAA) + lag(lt_yeild) + lag(net_equity) + lag(inflation) +
## lag(cbond_return) + lag(cs_premium), data = tsdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.245787 -0.025638 0.001655 0.028565 0.232978
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0164892 0.0088653 -1.860 0.063267 .
## lag(dividend_12) 0.0026003 0.0006924 3.756 0.000186 ***
## lag(book_market) 0.0473852 0.0131186 3.612 0.000323 ***
## lag(cor_BAA) -0.8899889 0.2951611 -3.015 0.002651 **
## lag(lt_yeild) 0.8316425 0.3346388 2.485 0.013157 *
## lag(net_equity) -0.2693967 0.1167764 -2.307 0.021319 *
## lag(inflation) -1.3707488 0.3561847 -3.848 0.000129 ***
## lag(cbond_return) 0.2461855 0.0855536 2.878 0.004117 **
## lag(cs_premium) 3.8845272 1.0727422 3.621 0.000312 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04504 on 779 degrees of freedom
## (1036 observations deleted due to missingness)
## Multiple R-squared: 0.06139, Adjusted R-squared: 0.05175
## F-statistic: 6.369 on 8 and 779 DF, p-value: 5.003e-08
mean(step_model_monthly$residuals^2)^0.5
## [1] 0.04478429
subset_quarterly <- na.omit(ts_quarter)
formula_all_predictors_quarterly <- paste("excess_returns ~", paste(lagged_regressors_quarterly, collapse = " + ")) %>% as.formula()
# Fit a model with all predictors
full_model_quarterly <- lm(formula_all_predictors_quarterly, data = subset_quarterly)
# Stepwise regression
step_model_quarterly <- stats::step(full_model_quarterly, direction = "both", trace = 0, k = 2)
# Print summary of the stepwise model
summary(step_model_quarterly)
##
## Call:
## lm(formula = excess_returns ~ lag(earning_12) + lag(cor_BAA) +
## lag(lt_yeild) + lag(consum_weal_incomeratio) + lag(cbond_return) +
## lag(stock_var) + lag(cs_premium) + lag(investment_capitalratio),
## data = subset_quarterly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.17223 -0.04043 0.00524 0.03281 0.14851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.540389 0.298095 -1.813 0.075867 .
## lag(earning_12) -0.007373 0.003677 -2.005 0.050386 .
## lag(cor_BAA) -7.410718 3.444017 -2.152 0.036270 *
## lag(lt_yeild) 8.788780 3.547951 2.477 0.016669 *
## lag(consum_weal_incomeratio) 3.039566 1.522820 1.996 0.051394 .
## lag(cbond_return) 0.506399 0.321821 1.574 0.121901
## lag(stock_var) 9.859165 2.678343 3.681 0.000569 ***
## lag(cs_premium) 33.920562 19.619596 1.729 0.089996 .
## lag(investment_capitalratio) 18.247261 9.543994 1.912 0.061626 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06775 on 50 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3256, Adjusted R-squared: 0.2177
## F-statistic: 3.017 on 8 and 50 DF, p-value: 0.00764
# Calculate RMSE
MSFE_f <- sqrt(mean(step_model_quarterly$residuals^2))
MSFE_f
## [1] 0.06237151
subset_year <- na.omit(ts_year)
formula_all_predictors_annual <- paste("excess_returns ~", paste(lagged_regressors_annual, collapse = " + ")) %>% as.formula()
# Fit a model with all predictors
full_model_annual <- lm(formula_all_predictors_annual, data = subset_year)
# Stepwise regression
step_model_annual <- stats::step(full_model_annual, direction = "both", trace = 0, k = 2)
# Print summary of the stepwise model
summary(step_model_annual)
##
## Call:
## lm(formula = excess_returns ~ lag(earning_12) + lag(book_market) +
## lag(inflation) + lag(equity_issuing) + lag(investment_capitalratio),
## data = subset_year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.282627 -0.112904 0.007809 0.098624 0.313350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.359164 0.279793 1.284 0.20529
## lag(earning_12) 0.005239 0.002589 2.023 0.04852 *
## lag(book_market) 0.520536 0.161828 3.217 0.00230 **
## lag(inflation) -2.314709 0.998531 -2.318 0.02466 *
## lag(equity_issuing) -0.899539 0.320321 -2.808 0.00713 **
## lag(investment_capitalratio) -11.664364 7.604887 -1.534 0.13151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1501 on 49 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2956, Adjusted R-squared: 0.2237
## F-statistic: 4.112 on 5 and 49 DF, p-value: 0.003387
# Calculate RMSE
MSFE_f <- sqrt(mean(step_model_annual$residuals^2))
MSFE_f
## [1] 0.1416521
data_monthly_cleaned <- na.omit(tsdata)
null_model <- lm(excess_returns ~ 1, data = data_monthly_cleaned)
current_formula <- "excess_returns ~ 1"
selected_predictors <- c()
min_AIC <- AIC(null_model)
# Repeat until no more predictors can be added
repeat {
aics <- numeric(length(lagged_regressors_monthly))
for(i in 1:length(lagged_regressors_monthly)) {
# Skip if predictor already selected
if (lagged_regressors_monthly[i] %in% selected_predictors) next
new_formula_string <- paste(current_formula, "+", lagged_regressors_monthly[i])
new_formula <- as.formula(new_formula_string)
new_model <- lm(new_formula, data = data_monthly_cleaned)
aics[i] <- AIC(new_model)
}
# Identify predictor with smallest AIC
min_index <- which.min(aics)
# If the new AIC is smaller, add predictor to model. Otherwise, stop.
if (aics[min_index] < min_AIC) {
selected_predictors <- c(selected_predictors, lagged_regressors_monthly[min_index])
current_formula <- paste(current_formula, "+", lagged_regressors_monthly[min_index])
min_AIC <- aics[min_index]
} else {
break
}
}
# Print the final model formula
print(paste("Final model formula:", current_formula))
## [1] "Final model formula: excess_returns ~ 1 + lag(inflation) + lag(cs_premium) + lag(cbond_return) + lag(net_equity) + lag(stock_var) + lag(dividend_12) + lag(book_market) + lag(cor_BAA) + lag(lt_yeild) + lag(treasury_bill)"
data_monthly_cleaned <- na.omit(ts_quarter)
null_model <- lm(excess_returns ~ 1, data = data_monthly_cleaned)
current_formula <- "excess_returns ~ 1"
selected_predictors <- c()
min_AIC <- AIC(null_model)
# Repeat until no more predictors can be added
repeat {
aics <- numeric(length(lagged_regressors_quarterly))
for(i in 1:length(lagged_regressors_quarterly)) {
# Skip if predictor already selected
if (lagged_regressors_quarterly[i] %in% selected_predictors) next
new_formula_string <- paste(current_formula, "+", lagged_regressors_quarterly[i])
new_formula <- as.formula(new_formula_string)
new_model <- lm(new_formula, data = data_monthly_cleaned)
aics[i] <- AIC(new_model)
}
# Identify predictor with smallest AIC
min_index <- which.min(aics)
# If the new AIC is smaller, add predictor to model. Otherwise, stop.
if (aics[min_index] < min_AIC) {
selected_predictors <- c(selected_predictors, lagged_regressors_quarterly[min_index])
current_formula <- paste(current_formula, "+", lagged_regressors_quarterly[min_index])
min_AIC <- aics[min_index]
} else {
break
}
}
# Print the final model formula
print(paste("Final model formula:", current_formula))
## [1] "Final model formula: excess_returns ~ 1"
data_monthly_cleaned <- na.omit(ts_year)
null_model <- lm(excess_returns ~ 1, data = data_monthly_cleaned)
current_formula <- "excess_returns ~ 1"
selected_predictors <- c()
min_AIC <- AIC(null_model)
# Repeat until no more predictors can be added
repeat {
aics <- numeric(length(lagged_regressors_annual))
for(i in 1:length(lagged_regressors_annual)) {
# Skip if predictor already selected
if (lagged_regressors_annual[i] %in% selected_predictors) next
new_formula_string <- paste(current_formula, "+", lagged_regressors_annual[i])
new_formula <- as.formula(new_formula_string)
new_model <- lm(new_formula, data = data_monthly_cleaned)
aics[i] <- AIC(new_model)
}
# Identify predictor with smallest AIC
min_index <- which.min(aics)
# If the new AIC is smaller, add predictor to model. Otherwise, stop.
if (aics[min_index] < min_AIC) {
selected_predictors <- c(selected_predictors, lagged_regressors_annual[min_index])
current_formula <- paste(current_formula, "+", lagged_regressors_annual[min_index])
min_AIC <- aics[min_index]
} else {
break
}
}
# Print the final model formula
print(paste("Final model formula:", current_formula))
## [1] "Final model formula: excess_returns ~ 1 + lag(investment_capitalratio) + lag(equity_issuing) + lag(book_market) + lag(inflation) + lag(earning_12)"